setwd("/Volumes/project/bioinformatics/JLee_lab/s224730/Marek/shRNA_18_samples_Human_aug2023/outputs/readCounts/Deseq2_without_C1/Analysis_1.4_9.4/")
library(DESeq2)
library(edgeR)
library(pheatmap)
library(RColorBrewer)
library(ggplot2)
library(gplots)
library(gridExtra)
library("biomaRt")
library(corrplot)
library(ggrepel)
library("plotly")#, lib.loc="~/R/win-library/3.5")
library(biomaRt)
library(knitr)
library(VennDiagram)
library(ggpubr)
library(limma)
library(reshape2)
library(readxl)
library(kableExtra)
library(EnhancedVolcano)

Data <- read.csv("readCount.tsv",
                header = TRUE, 
                sep = '\t',
                skip=1)

rownames(Data) <- Data$Geneid
#View(Data)

colnames(Data) <- c("Geneid","Chr","Start","End","Strand","Length","C1_FXNSH2","C1_FXNSH3","C1_Novirus","C1_SCR","C2_FXNSH2","C2_FXNSH3", "C2_Novirus", "C2_SCR", "C3_FXNSH2", "C3_FXNSH3", "C3_Novirus", "C3_SCR", "F1_Novirus", "F1_SCR", "F2_Novirus", "F2_SCR", "F3_Novirus", "F3_SCR")

##=== countData
countData <- as.matrix(Data[,-c(1:10)])
mode(countData) <- "integer"


colData <- read.csv("info1.csv", sep=",", row.names=1)

#View(countData)
#View(colData)

GeneID_table <- Data[,c(1:6)]

#View(GeneID_table)
##===Generate a Gene Symbol to Ensembl/Entrez ID conversion table
#ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
#Symbol_table <- getBM(attributes=c("hgnc_symbol","ensembl_gene_id","entrezgene_id"), filters = 'hgnc_symbol', values = rownames(GeneID_table), mart = ensembl)
#write.table(Symbol_table, file="Symbol_table.txt", quote=FALSE, sep="\t")

Data preprocessing, filtering, and normalization

##===Check all sample IDs in colData are also in CountData and match their orders
all(rownames(colData) %in% colnames(countData))
## [1] TRUE
countData <- countData[, rownames(colData)]
all(rownames(colData) == colnames(countData))
## [1] TRUE
#=load a Symbol_table saved in advance
Symbol_table <- read.table("Symbol_table.txt", sep="\t")#, na.strings="")
idx_na <- which(is.na(Symbol_table$entrezgene_id))
Symbol_table <- as.matrix(Symbol_table)
Symbol_table[idx_na,2] <- Symbol_table[idx_na,1]

#=Add features oringinally not contained
idx <- which(!(rownames(countData) %in% Symbol_table[,3]))
tmp <- rownames(countData)[idx]
tmp <- cbind(tmp, tmp, tmp)
Symbol_table <- rbind(Symbol_table, tmp)

Symbol_table <- data.frame(Symbol_table)
rownames(Symbol_table) <- c(1:length(rownames(Symbol_table)))

##===Gene filtering
#countData_cpm <- t(t(countData)/colSums(countData))*1e6
#keep_g <- rowSums(countData_cpm > 1) > 5
#dim(countData[keep_g,])
#countData <- countData[keep_g,]

##===Data normalization
dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design= ~ Groups)
#vsd <- vst(dds, blind=FALSE)
#rld <- rlog(dds, blind=FALSE); vsd<-rld
#head(assay(vsd), 3)

PART A. DESeq analysis

  1. DESeq Analyses
dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design= ~ Groups)
dds <- DESeq(dds, test = "Wald")
# Assuming 'dds' is already defined and DESeq analysis has been performed
# ...

# Get normalized counts
#normalized_counts <- counts(dds, normalized = TRUE)

# If you want raw counts instead of normalized counts, use:
# raw_counts <- counts(dds, normalized = FALSE)

# Convert normalized counts to a data frame
#normalized_counts_df <- as.data.frame(normalized_counts)

# Save normalized counts to a CSV file
#write.csv(normalized_counts_df, file = "normalized_counts_for_gsea.csv", row.names = TRUE)

Groups: Patient vs Control, F.SCR vs C.SCR, FXNsh2 vs C.SCR, FXNsh3 vs C.SCR, FXNsh2 vs FXNsh3, FXNsh2 vs F.SCR, FXNsh3 vs F.SCR, FXNsh vs C.SCR, FXNsh vs F.SCR

  1. Analsis 1.4: Patient vs Control
#Generate an MA plot for Patient vs Control

res_Patient_vs_Control <- results(dds, contrast = c("Groups", "Patient", "Control"), pAdjustMethod = "BH")
res_Patient_vs_Control <- res_Patient_vs_Control[order(res_Patient_vs_Control$pvalue),]
summary(res_Patient_vs_Control)
## 
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 736, 2.1%
## LFC < 0 (down)     : 347, 1%
## outliers [1]       : 11, 0.032%
## low counts [2]     : 19394, 56%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Analysis 1.4) MA plot

res_Patient_vs_Control <- data.frame(res_Patient_vs_Control)
res_Patient_vs_Control$Symbol <- rownames(res_Patient_vs_Control)

df_MA1 <- res_Patient_vs_Control[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA1, main = "Patient vs Control", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
         palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA1$Symbol), top = 20, ylim=c(-10,10), 
         xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).

Analysis 1.4) Volcano Plot

#Volcano Plot

EnhancedVolcano(res_Patient_vs_Control, x="log2FoldChange", y="pvalue", lab=res_Patient_vs_Control$Symbol, pCutoff = 0.05, FCcutoff = 1.0)

  1. Analysis 2.4) F.SCR vs. C.SCR
#Generate an MA plot for  F.SCR vs C.SCR

res_F.SCR_vs_C.SCR <- results(dds, contrast = c("Groups", "F.SCR", "C.SCR"), pAdjustMethod = "BH")
res_F.SCR_vs_C.SCR <- res_F.SCR_vs_C.SCR[order(res_F.SCR_vs_C.SCR$padj),]
summary(res_F.SCR_vs_C.SCR)
## 
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 96, 0.28%
## LFC < 0 (down)     : 68, 0.2%
## outliers [1]       : 11, 0.032%
## low counts [2]     : 18101, 52%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Analysis 2.4) MA Plot

res_F.SCR_vs_C.SCR <- data.frame(res_F.SCR_vs_C.SCR)
res_F.SCR_vs_C.SCR$Symbol <- rownames(res_F.SCR_vs_C.SCR)

df_MA1 <- res_F.SCR_vs_C.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA1, main = "F.SCR vs C.SCR", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
         palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA1$Symbol), top = 20, ylim=c(-10,10), 
         xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).

Analysis 2.4) Volcano Plot

#Volcano Plot

EnhancedVolcano(res_F.SCR_vs_C.SCR, x="log2FoldChange", y="pvalue", lab=res_F.SCR_vs_C.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)

  1. Analysis 3.4) FXNsh2 vs C.SCR
#Generate an MA plot for FXNSH2 vs C.SCR

res_FXNsh2_vs_C.SCR <- results(dds, contrast = c("Groups", "FXNsh2", "C.SCR"), pAdjustMethod = "BH")
res_FXNsh2_vs_C.SCR <- res_FXNsh2_vs_C.SCR[order(res_FXNsh2_vs_C.SCR$padj),]
summary(res_FXNsh2_vs_C.SCR)
## 
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2824, 8.1%
## LFC < 0 (down)     : 3399, 9.8%
## outliers [1]       : 11, 0.032%
## low counts [2]     : 16808, 48%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Analysis 3.4) MA Plot

res_FXNsh2_vs_C.SCR <- data.frame(res_FXNsh2_vs_C.SCR)
res_FXNsh2_vs_C.SCR$Symbol <- rownames(res_FXNsh2_vs_C.SCR)

df_MA2 <- res_FXNsh2_vs_C.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA2, main = "FXNsh2 vs C.SCR", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
         palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA2$Symbol), top = 20, ylim=c(-10,10), 
         xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).

Analysis 3.4) Volcano Plot

#Volcano Plot

EnhancedVolcano(res_FXNsh2_vs_C.SCR, x="log2FoldChange", y="pvalue",lab=res_FXNsh2_vs_C.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)

  1. Analysis 4.4) FXNsh3 vs C.SCR
#Generate an MA plot for FXNsh3 vs C.SCR

res_FXNsh3_vs_C.SCR <- results(dds, contrast = c("Groups", "FXNsh3", "C.SCR"), pAdjustMethod = "BH")
res_FXNsh3_vs_C.SCR <- res_FXNsh3_vs_C.SCR[order(res_FXNsh3_vs_C.SCR$padj),]
summary(res_FXNsh3_vs_C.SCR)
## 
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1372, 4%
## LFC < 0 (down)     : 1612, 4.6%
## outliers [1]       : 11, 0.032%
## low counts [2]     : 20687, 60%
## (mean count < 13)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Analysis 4.4) MA Plot

res_FXNsh3_vs_C.SCR <- data.frame(res_FXNsh3_vs_C.SCR)
res_FXNsh3_vs_C.SCR$Symbol <- rownames(res_FXNsh3_vs_C.SCR)

df_MA1 <- res_FXNsh3_vs_C.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA1, main = "FXNsh3 vs C.SCR", fdr = 0.05, fc = 1.5, size = 1, legend = "top",
         palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA1$Symbol), top = 20, ylim=c(-10,10), 
         xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).

Analysis 4.4) Volcano Plot

#Volcano Plot

EnhancedVolcano(res_FXNsh3_vs_C.SCR, x="log2FoldChange", y="pvalue", lab=res_FXNsh3_vs_C.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)

Analysis 5.4) Fxnsh2 vs Fxnsh3

#Generate an MA plot for Novirus vs FXNSH2

res_FXNsh2_vs_FXNsh3 <- results(dds, contrast = c("Groups", "FXNsh2", "FXNsh3"), pAdjustMethod = "BH")
res_FXNsh2_vs_FXNsh3 <- res_FXNsh2_vs_FXNsh3[order(res_FXNsh2_vs_FXNsh3$pvalue),]
summary(res_FXNsh2_vs_FXNsh3)
## 
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2462, 7.1%
## LFC < 0 (down)     : 2871, 8.3%
## outliers [1]       : 11, 0.032%
## low counts [2]     : 17454, 50%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Analysis 5.4) MA Plot

res_FXNsh2_vs_FXNsh3 <- data.frame(res_FXNsh2_vs_FXNsh3)
res_FXNsh2_vs_FXNsh3$Symbol <- rownames(res_FXNsh2_vs_FXNsh3)

df_MA3 <- res_FXNsh2_vs_FXNsh3[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA3, main = "FXNsh2_vs_FXNsh3", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
         palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA3$Symbol), top = 20, ylim=c(-10,10), 
         xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).

Analysis 5.4) Volcano Plot

#Volcano Plot

EnhancedVolcano(res_FXNsh2_vs_FXNsh3, x="log2FoldChange", y="pvalue", lab=res_FXNsh2_vs_FXNsh3$Symbol, pCutoff = 0.05, FCcutoff = 1.0)

Analysis 6.4) Fxnsh2 vs F.SCR

#Generate an MA plot for Novirus vs FXNSH2

res_FXNsh2_vs_F.SCR <- results(dds, contrast = c("Groups", "FXNsh2", "F.SCR"), pAdjustMethod = "BH")
res_FXNsh2_vs_F.SCR <- res_FXNsh2_vs_F.SCR[order(res_FXNsh2_vs_F.SCR$pvalue),]
summary(res_FXNsh2_vs_F.SCR)
## 
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 3727, 11%
## LFC < 0 (down)     : 4148, 12%
## outliers [1]       : 11, 0.032%
## low counts [2]     : 14869, 43%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Analysis 6.4) MA Plot

res_FXNsh2_vs_F.SCR <- data.frame(res_FXNsh2_vs_F.SCR)
res_FXNsh2_vs_F.SCR$Symbol <- rownames(res_FXNsh2_vs_F.SCR)

df_MA3 <- res_FXNsh2_vs_F.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA3, main = "FXNsh2_vs_F.SCR", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
         palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA3$Symbol), top = 20, ylim=c(-10,10), 
         xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).

Analysis 6.4) Volcano Plot

#Volcano Plot

EnhancedVolcano(res_FXNsh2_vs_F.SCR, x="log2FoldChange", y="pvalue", lab=res_FXNsh2_vs_F.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)

Analysis 7.4) Fxnsh3 vs F.SCR

#Generate an MA plot for Novirus vs FXNSH2

res_FXNsh3_vs_F.SCR <- results(dds, contrast = c("Groups", "FXNsh3", "F.SCR"), pAdjustMethod = "BH")
res_FXNsh3_vs_F.SCR <- res_FXNsh2_vs_FXNsh3[order(res_FXNsh3_vs_F.SCR$pvalue),]
summary(res_FXNsh3_vs_F.SCR)
##     baseMean        log2FoldChange       lfcSE            stat        
##  Min.   :     0.0   Min.   :-7.660   Min.   :0.075   Min.   :-20.129  
##  1st Qu.:     0.0   1st Qu.:-0.558   1st Qu.:0.337   1st Qu.: -0.452  
##  Median :     0.1   Median : 0.000   Median :1.483   Median :  0.000  
##  Mean   :   179.0   Mean   : 0.021   Mean   :2.531   Mean   : -0.043  
##  3rd Qu.:     7.9   3rd Qu.: 0.684   3rd Qu.:5.683   3rd Qu.:  0.518  
##  Max.   :378100.3   Max.   : 7.794   Max.   :5.771   Max.   : 14.965  
##                     NA's   :25877    NA's   :25877   NA's   :25877    
##      pvalue           padj          Symbol         
##  Min.   :0.000   Min.   :0.00    Length:60583      
##  1st Qu.:0.157   1st Qu.:0.05    Class :character  
##  Median :0.631   Median :0.35    Mode  :character  
##  Mean   :0.542   Mean   :0.40                      
##  3rd Qu.:0.842   3rd Qu.:0.73                      
##  Max.   :1.000   Max.   :1.00                      
##  NA's   :25888   NA's   :43342

Analysis 7.4) MA Plot

res_FXNsh3_vs_F.SCR <- data.frame(res_FXNsh3_vs_F.SCR)
res_FXNsh3_vs_F.SCR$Symbol <- rownames(res_FXNsh3_vs_F.SCR)

df_MA3 <- res_FXNsh3_vs_F.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA3, main = "FXNsh3_vs_F.SCR", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
         palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA3$Symbol), top = 20, ylim=c(-10,10), 
         xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).

Analysis 7.4) Volcano Plot

#Volcano Plot

EnhancedVolcano(res_FXNsh3_vs_F.SCR, x="log2FoldChange", y="pvalue", lab=res_FXNsh3_vs_F.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)

DESeq Analyses

dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design= ~ Group1)
dds <- DESeq(dds, test = "Wald")

Analysis 8.4) FXNsh vs F.SCR

#Generate an MA plot for Novirus vs FXNSH2

res_FXNsh_vs_F.SCR <- results(dds, contrast = c("Group1", "FXNsh", "F.SCR"), pAdjustMethod = "BH")
res_FXNsh_vs_F.SCR <- res_FXNsh_vs_F.SCR[order(res_FXNsh_vs_F.SCR$pvalue),]
summary(res_FXNsh_vs_F.SCR)
## 
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2823, 8.1%
## LFC < 0 (down)     : 2859, 8.2%
## outliers [1]       : 32, 0.092%
## low counts [2]     : 15515, 45%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Analysis 8.4) MA Plot

res_FXNsh_vs_F.SCR <- data.frame(res_FXNsh_vs_F.SCR)
res_FXNsh_vs_F.SCR$Symbol <- rownames(res_FXNsh_vs_F.SCR)

df_MA3 <- res_FXNsh_vs_F.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA3, main = "FXNsh vs F.SCR", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
         palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA3$Symbol), top = 20, ylim=c(-10,10), 
         xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).

Analysis 8.4) Volcano Plot

#Volcano Plot

EnhancedVolcano(res_FXNsh_vs_F.SCR, x="log2FoldChange", y="pvalue", lab=res_FXNsh_vs_F.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)

Analysis 9.4) FXNsh vs C.SCR

#Generate an MA plot for SCR vs Novirus
res_FXNsh_vs_C.SCR <- results(dds, contrast = c("Group1", "FXNsh", "C.SCR"), pAdjustMethod = "BH")
res_FXNsh_vs_C.SCR <- res_FXNsh_vs_C.SCR[order(res_FXNsh_vs_C.SCR$pvalue),]
summary(res_FXNsh_vs_C.SCR)
## 
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1433, 4.1%
## LFC < 0 (down)     : 1524, 4.4%
## outliers [1]       : 32, 0.092%
## low counts [2]     : 19391, 56%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Analysis 9.4) MA Plot

res_FXNsh_vs_C.SCR <- data.frame(res_FXNsh_vs_C.SCR)
res_FXNsh_vs_C.SCR$Symbol <- rownames(res_FXNsh_vs_C.SCR)

df_MA4 <- res_FXNsh_vs_C.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA4, main = "FXNsh vs C.SCR", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
         palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA4$Symbol), top = 20, ylim=c(-10,10), 
         xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).

Analysis 9.4) Volcano PLot

#Volcano Plot

EnhancedVolcano(res_FXNsh_vs_C.SCR, x="log2FoldChange", y="pvalue", lab=res_FXNsh_vs_C.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)